Note: Open scripts.Rproj first, then script. To easily use relative paths, click the down button next to knit and then click “Knit Directory –> Project Directory”. This should make loading and saving files much easier.

GO Term and KEGG Enrichment Analysis

Run GO Term enrichment analysis using results from WGCNA analysis (i.e., after running the 2a-WGCNA.Rmd script). Red text indicates regions that require the user to modify. Regardless, the user should check over all code blocks to ensure that everything is running correctly.

1. Load packages

Load packages

library(goseq)
library(tidyverse)
library(GSEABase)
library(data.table)
library(ggplot2)
library(cowplot)
library(patchwork)

2. Set variables

Change file names and conditions where appropriate.

# Input unfiltered data
treatmentinfo.file <- "salmon.numreads.WGCNA_samples.tsv"
wgcna_results.file <- "salmon.numreads.WGCNA_results.tsv"
annotation.file <- "Pocillopora_acuta_KBHIv2.pep.GOs.tsv"

# Output DEG results
GO_term_sig.file <- "salmon.numreads.WGCNA_results.GOsig.tsv"
KEGG_sig.file <- "salmon.numreads.WGCNA_results.KEGGsig.tsv"

# Cutoff for significant GO term p-values
GO_enrich_pvalue.cutoff <- 0.05
KEGG_enrich_pvalue.cutoff <- 0.05

# KEGG ID description file - provided with script
KEGG_IDs_Descriptions.file <- "2b-KEGG_IDs_Descriptions.tsv"

3. Load, clean, and pre-processing datasets

Load the input file containing the treatment information.

treatmentinfo <- read.csv(treatmentinfo.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM") # Read in file
head(treatmentinfo)
##              sample_id unit lib_type         fq1         fq2 species plug_id
## 1 Pacuta_ATAC_TP1_1043    1       pe SRR14610932 SRR14610932  Pacuta    1043
## 2 Pacuta_ATAC_TP1_1775    1       pe SRR14610931 SRR14610931  Pacuta    1775
## 3 Pacuta_ATAC_TP1_2363    1       pe SRR14610930 SRR14610930  Pacuta    2363
## 4 Pacuta_ATAC_TP3_1041    1       pe SRR14610929 SRR14610929  Pacuta    1041
## 5 Pacuta_ATAC_TP3_1471    1       pe SRR14610928 SRR14610928  Pacuta    1471
## 6 Pacuta_ATAC_TP3_1637    1       pe SRR14610927 SRR14610927  Pacuta    1637
##   treatment timepoint            reef ploidy  group num
## 1      ATAC         1 Lilipuna.Fringe      3 Group4   1
## 2      ATAC         1      Reef.42.43      2 Group5   1
## 3      ATAC         1         Reef.18      3 Group3   1
## 4      ATAC         3      Reef.11.13      3 Group2   1
## 5      ATAC         3      Reef.35.36      3 Group3   1
## 6      ATAC         3      Reef.11.13      3 Group2   1
# Check we have the right column names
headers <- c("sample_id")
if( all(headers %in% colnames(treatmentinfo)) ){
  print(paste(treatmentinfo.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
  stop(paste(treatmentinfo.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "salmon.numreads.WGCNA_samples.tsv has the required columns: sample_id"

Load the input file containing WGCNA results.

wgcna_results <- as.data.frame(read.csv(wgcna_results.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM")) # Read in file

# Check we have the right column names
headers <- c("gene_id")
if( all(headers %in% colnames(wgcna_results)) ){
  print(paste(wgcna_results.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
  stop(paste(wgcna_results.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "salmon.numreads.WGCNA_results.tsv has the required columns: gene_id"

Filter wgcna_results to just the module that we want to analyze. Adjust filtering as required.

wgcna_results <- wgcna_results %>%
  filter(moduleColor %in% c("darkred"))
head(wgcna_results)
##                                        gene_id moduleColor    GS.group
## 1  Pocillopora_acuta_KBHIv2___RNAseq.g18206.t1     darkred -0.03207267
## 2      Pocillopora_acuta_KBHIv2___TS.g21588.t1     darkred -0.03402285
## 3 Pocillopora_acuta_KBHIv2___RNAseq.g21467.t1a     darkred  0.03428403
## 4      Pocillopora_acuta_KBHIv2___TS.g26006.t2     darkred  0.04265057
## 5  Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1     darkred  0.04354234
## 6    Pocillopora_acuta_KBHIv2___RNAseq.21015_t     darkred  0.05834933
##   p.GS.group   MM.green     p.MM.green MM.yellow    p.MM.yellow  MM.magenta
## 1  0.7313859 -0.2954921 0.001218716723 0.2099302 0.023102245593  0.26224908
## 2  0.7157328 -0.2479159 0.007038059802 0.2297861 0.012692763678  0.09474403
## 3  0.7136453 -0.3544783 0.000088101563 0.2664554 0.003685433089  0.24342122
## 4  0.6479650 -0.4257462 0.000001706014 0.4192948 0.000002533992  0.11552494
## 5  0.6411104 -0.2887780 0.001590567087 0.1643817 0.076554432284  0.04444531
## 6  0.5320361 -0.3272705 0.000316173209 0.1528483 0.099920641962 -0.06917879
##   p.MM.magenta MM.lightcyan p.MM.lightcyan MM.darkgrey p.MM.darkgrey
## 1  0.004285127   0.01901123      0.8387846   0.1525828 0.10051821477
## 2  0.309582686   0.06888993      0.4604929   0.2290520 0.01298816345
## 3  0.008177837  -0.01634581      0.8611422   0.2588572 0.00483076887
## 4  0.214851733   0.05536727      0.5532389   0.3843956 0.00001878854
## 5  0.634200409   0.03476737      0.7097875   0.2063496 0.02560647413
## 6  0.458610037   0.12976126      0.1631920   0.1402469 0.13150876798
##      MM.brown    p.MM.brown MM.darkgreen     p.MM.darkgreen MM.royalblue
## 1  0.32342826 0.00037518863    0.4611408 0.0000001673496227  -0.11387554
## 2  0.19088254 0.03925173212    0.3879501 0.0000154775243793  -0.05407675
## 3  0.28967623 0.00153546715    0.5408257 0.0000000003080091  -0.20387922
## 4  0.38772901 0.00001566628    0.5420219 0.0000000002766267  -0.10426734
## 5  0.03016025 0.74684380557    0.3361713 0.0002108435316810  -0.14774859
## 6 -0.01177189 0.89975589405    0.3179398 0.0004772341356053  -0.03564825
##   p.MM.royalblue MM.darkturquoise p.MM.darkturquoise MM.darkred
## 1     0.22151536       0.02196884         0.81412619  0.6962332
## 2     0.56253977       0.01032036         0.91206346  0.4729281
## 3     0.02746629       0.04126119         0.65870314  0.7333859
## 4     0.26323947       0.07163859         0.44275039  0.7420472
## 5     0.11189692       0.18356344         0.04758099  0.3859649
## 6     0.70277582      -0.15767693         0.08953358  0.3060560
##                    p.MM.darkred MM.midnightblue p.MM.midnightblue   MM.grey60
## 1 0.000000000000000002920906258      0.14812762       0.110970022  0.36123754
## 2 0.000000072736116091717396760      0.22157673       0.016354363  0.19388954
## 3 0.000000000000000000005332528      0.23208925       0.011803784  0.28256291
## 4 0.000000000000000000001051349      0.29338354       0.001325931  0.32256203
## 5 0.000017252112707076900423872      0.28859276       0.001602152 -0.01420550
## 6 0.000791173268813867049120692     -0.06177483       0.508192283 -0.03379452
##     p.MM.grey60 MM.lightgreen     p.MM.lightgreen   MM.purple    p.MM.purple
## 1 0.00006296495    0.56538600 0.00000000003107586 -0.06524957 0.484583541264
## 2 0.03620305820    0.33446523 0.00022808740092415 -0.19925906 0.031254068495
## 3 0.00202373068    0.57298303 0.00000000001470413 -0.25680082 0.005191028459
## 4 0.00038982649    0.46727947 0.00000010885875478 -0.40954227 0.000004538237
## 5 0.87917594729    0.26026345 0.00459744180687308 -0.37312798 0.000034231090
## 6 0.71755954761    0.07702289 0.40913770282975298 -0.08028197 0.389541744824
##   MM.greenyellow p.MM.greenyellow MM.lightyellow p.MM.lightyellow   MM.pink
## 1     0.32903701 0.00029202818190     0.01837345    0.84412334870 0.6289653
## 2     0.26800680 0.00348401430944    -0.14569225    0.11703102359 0.4418650
## 3     0.36695766 0.00004710474873    -0.16886146    0.06875928239 0.6857097
## 4     0.48808936 0.00000002375842    -0.36703550    0.00004691732 0.6211864
## 5     0.09611212 0.30261664656232    -0.27502849    0.00269054602 0.3841575
## 6    -0.09098295 0.32926766901003     0.06146262    0.51034234633 0.2544099
##                   p.MM.pink      MM.tan      p.MM.tan      MM.red   p.MM.red
## 1 0.00000000000003110185686  0.29598188 0.00119497096 -0.17422162 0.06029145
## 2 0.00000061216834424629705  0.29569838 0.00120866320 -0.18702807 0.04347152
## 3 0.00000000000000001468793  0.34205200 0.00016023365 -0.18499196 0.04584913
## 4 0.00000000000007880754650  0.42720300 0.00000155844 -0.16376855 0.07767460
## 5 0.00001903264233562839897  0.13479527 0.14733721057 -0.05450836 0.55942087
## 6 0.00563986248076380012467 -0.01315947 0.88801274683  0.01873769 0.84107345
##   MM.turquoise p.MM.turquoise   MM.orange p.MM.orange     MM.blue  p.MM.blue
## 1 -0.102299695      0.2724045 -0.04878553 0.601432476 -0.03470709 0.71026826
## 2 -0.051650668      0.5802235  0.15083561 0.104520769 -0.11747855 0.20714509
## 3 -0.055545072      0.5519634  0.06179423 0.508058851 -0.12066546 0.19500173
## 4 -0.114997466      0.2169671  0.27593099 0.002601376 -0.07547774 0.41862697
## 5  0.004607088      0.9606814  0.01953099 0.834439136 -0.24360792 0.00812743
## 6  0.034040411      0.7155924 -0.14703976 0.113646491 -0.18516055 0.04564824
##      MM.black  p.MM.black    MM.cyan     p.MM.cyan   MM.salmon p.MM.salmon
## 1 -0.26540413 0.003827795 -0.3693122 0.00004173412  0.22517828 0.014648281
## 2 -0.24758335 0.007117293 -0.2118528 0.02184644486  0.14315313 0.123620186
## 3 -0.26118846 0.004449504 -0.2861430 0.00176272684  0.17699429 0.056258016
## 4 -0.26149576 0.004401307 -0.1675263 0.07101213558  0.27799643 0.002407257
## 5 -0.14045907 0.130920079 -0.1076121 0.24814666541 -0.03144554 0.736443456
## 6 -0.09145676 0.326744653 -0.1883824 0.04194766139 -0.18839045 0.041938749

Load the input file containing gene annotations (GO terms, KEGG IDs, and gene/protein lengths).

# Annotations
annot <- read.csv(annotation.file, header=TRUE, sep="\t", fileEncoding="UTF-8-BOM")

# Check we have the right column names
headers <- c("gene_id", "GO_IDs", "KEGG_IDs", "length")
if( all(headers %in% colnames(annot)) ){
  paste(paste(annotation.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
  stop(paste(annotation.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "Pocillopora_acuta_KBHIv2.pep.GOs.tsv has the required columns: gene_id, GO_IDs, KEGG_IDs, length"
# Select only relevant information
Go.ref <- subset(annot, select= c(gene_id, length))

# Merge wgcna_results by available annotations
Go.ref <- merge(wgcna_results, Go.ref, by = "gene_id")
Go.ref <- unique(Go.ref)
dim(Go.ref)
## [1] 108  55

Set ID and gene length vectors, and make a binary matrix indicating which genes are differentially expressed. These are used as input to nullp, which for calculates a Probability Weighting Function for DEGs.

# Make ID and length vectors
IDvector <- annot$gene_id
lengthVector <- annot$length

# Get binary list indicating which genes are in WGCNA results and which are not out of all genes
wgcna.genes <- as.integer(annot$gene_id %in% Go.ref$gene_id)
names(wgcna.genes) <- annot$gene_id
print(paste("Number of WGCNA genes:     ", length(wgcna.genes[wgcna.genes == 1]), sep=''))
## [1] "Number of WGCNA genes:     108"
print(paste("Number of NON WGCNA genes: ", length(wgcna.genes[wgcna.genes == 0]), sep=''))
## [1] "Number of NON WGCNA genes: 24804"
# Weight vector by length of gene
pwf <- nullp(DEgenes=wgcna.genes, id=IDvector, bias.data=lengthVector)
## Warning in pcls(G): initial point very close to some inequality constraints

Prepare GO term dataframe.

# Cleanup GO terms and split multiple GO terms per line, into one GO term per line
GO.annot <- annot %>%
  subset(select=c(gene_id, GO_IDs)) %>%
  drop_na() %>% # Remove rows with missing GO terms or gene IDs
  filter(GO_IDs!="-") # Remove rows with missing values indicated by '-'
splitted <- strsplit(as.character(GO.annot$GO_IDs), "; ") # Split into multiple GO ids
GO.terms <- data.frame(v1 = rep.int(GO.annot$gene_id, sapply(splitted, length)), v2 = unlist(splitted)) # List all genes with each of their GO terms in a single row
colnames(GO.terms) <- c("gene_id", "GO.ID")

# Cleanup single-line GO term dataframe
GO.terms <- GO.terms %>%
  mutate(GO.ID = gsub(" ", "", GO.ID)) %>% # Remove spaces from GO terms - in case any exist by mistake
  mutate(GO.ID = as.character(GO.ID)) %>%
  mutate(GO.ID = factor(GO.ID, levels=unique(GO.ID))) %>%
  mutate(gene_id = factor(gene_id, levels=unique(gene_id))) %>%
  unique()

# Print stats
print(paste("No rows in 'GO.terms': ", dim(GO.terms)[1], sep=''))
## [1] "No rows in 'GO.terms': 1655777"
print(paste("Avg GO IDs per gene: ", nrow(GO.terms) / length(unique(GO.terms$gene_id)), sep=''))
## [1] "Avg GO IDs per gene: 142.26110490592"
head(GO.terms)
##                                        gene_id      GO.ID
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001101
## 2 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001932
## 3 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001933
## 4 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002009
## 5 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002020
## 6 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002165

Find enriched GO terms, “selection-unbiased testing for category enrichment amongst significantly expressed genes for RNA-seq data”.

GOall <- goseq(pwf, GOref$gene_id, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns

Find only enriched GO terms that are statistically significant at cutoff.

GOall.filtered <- GOall %>% 
  filter(over_represented_pvalue < GO_enrich_pvalue.cutoff | under_represented_pvalue < GO_enrich_pvalue.cutoff) %>% # Filter GO enrichment results by p-value
  arrange(ontology, over_represented_pvalue, -numDEInCat) %>% # Reorder
  mutate(term = factor(term, levels = unique(term))) %>% # Make 'terms' column a factor
  mutate(dir = if_else(over_represented_pvalue < GO_enrich_pvalue.cutoff, 
                       "Over", 
                       if_else(under_represented_pvalue < GO_enrich_pvalue.cutoff, 
                               "Under", 
                               "NULL"
                              )
                      )
        )

# Print stats
print(paste("Number GOs BEFORE sig filtering: ", nrow(GOall), sep=''))
## [1] "Number GOs BEFORE sig filtering: 21041"
print(paste("Number GOs AFTER sig filtering:  ", nrow(GOall.filtered), sep=''))
## [1] "Number GOs AFTER sig filtering:  284"
print(paste("Number GOs AFTER sig filtering OVER:  ", nrow(GOall.filtered %>% filter(over_represented_pvalue  < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number GOs AFTER sig filtering OVER:  201"
print(paste("Number GOs AFTER sig filtering UNDER: ", nrow(GOall.filtered %>% filter(under_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number GOs AFTER sig filtering UNDER: 83"
print(paste("Number sig BP terms: ", nrow(filter(GOall.filtered, ontology=="BP")), sep=''))
## [1] "Number sig BP terms: 192"
print(paste("Number sig MF terms: ", nrow(filter(GOall.filtered, ontology=="MF")), sep=''))
## [1] "Number sig MF terms: 43"
print(paste("Number sig CC terms: ", nrow(filter(GOall.filtered, ontology=="CC")), sep=''))
## [1] "Number sig CC terms: 36"

Correct any un-annotated terms/ontologies.

NAs.ontology <- GOall.filtered %>% subset(is.na(term))
NAs.ontology
##       category over_represented_pvalue under_represented_pvalue numDEInCat
## 272 GO:0044462            0.0004026618              0.999997456          2
## 273 GO:0030898            0.0047769228              0.999854369          2
## 274 GO:0015307            0.0063233891              1.000000000          1
## 275 GO:0044257            0.0147909874              0.995684810          7
## 276 GO:0035042            0.0289434734              0.999684923          1
## 277 GO:0044265            0.0349708204              0.986850633          8
## 278 GO:0004003            0.0397530084              0.996128229          2
## 279 GO:0033613            0.0486189347              0.994659753          2
## 280 GO:0006928            0.9892080638              0.035336862          3
## 281 GO:0000904            0.9933563634              0.040830321          1
## 282 GO:0010941            0.9994985101              0.004458872          1
## 283 GO:0010942            1.0000000000              0.031163280          0
## 284 GO:0051270            1.0000000000              0.015034446          0
##     numInCat term ontology   dir
## 272        5 <NA>     <NA>  Over
## 273       20 <NA>     <NA>  Over
## 274        1 <NA>     <NA>  Over
## 275      546 <NA>     <NA>  Over
## 276        4 <NA>     <NA>  Over
## 277      817 <NA>     <NA>  Over
## 278       52 <NA>     <NA>  Over
## 279       84 <NA>     <NA>  Over
## 280     1571 <NA>     <NA> Under
## 281      964 <NA>     <NA> Under
## 282     1553 <NA>     <NA> Under
## 283      720 <NA>     <NA> Under
## 284      841 <NA>     <NA> Under

Save significant terms.

write.table(GOall.filtered, file = GO_term_sig.file, row.names=F, quote=F, sep='\t')

4. Find GOslim terms

Run GOslim to get broader categories.

slim <- getOBOCollection("http://current.geneontology.org/ontology/subsets/goslim_generic.obo") # Get GO database

## BP
BP_GO <- GOall.filtered %>%
  filter(ontology=="BP")
BPGO_collection <- GOCollection(BP_GO$category) # Make library of query terms
slims_bp <- data.frame(goSlim(BPGO_collection, slim, "BP")) # Find common parent terms to slim down our list
slims_bp$category <- row.names(slims_bp) # Save rownames as category

## MF
MF_GO <- GOall.filtered %>%
  filter(ontology=="MF")
MFGO_collection <- GOCollection(MF_GO$category) # Make library of query terms
slims_mf <- data.frame(goSlim(MFGO_collection, slim, "MF")) # Find common parent terms to slim down our list
slims_mf$category <- row.names(slims_mf) # Save rownames as category

Get mapped terms, using functions from Sam White’s Biostars post.

# Write function mappedIds to get the query terms that mapped to the slim categories
mappedIds <-function(df, collection, OFFSPRING) { # The command to run requires a dataframe of slim terms, like slims_MF above, your list of query terms, and the offspring from the GOCollection by goSlim
  map <- as.list(OFFSPRING[rownames(df)]) # Subset GOcollection offspring by the rownames of your dataframe
  mapped <- lapply(map, intersect, ids(collection)) # Find the terms that intersect between the subset made above of your query terms and the GOids from the GO collection
  df[["go_terms"]] <- vapply(unname(mapped), paste, collapse = ";", character(1L)) # Add column "go_terms" with matching terms 
  df # Show resulting dataframe
}

# Run function for MF and BP terms
BPslim <- mappedIds(slims_bp, BPGO_collection, GOBPOFFSPRING)
MFslim <- mappedIds(slims_mf, MFGO_collection, GOMFOFFSPRING)

Remove duplicate matches, keeping the broader umbrella term

# BP
BPslim <- filter(BPslim, Count>0 & Term!="biological_process") # Filter out empty slims and term "biological process"
BPsplitted <- strsplit(as.character(BPslim$go_terms), ";") # Split into multiple GO ids
BPslimX <- data.frame(Term = rep.int(BPslim$Term, sapply(BPsplitted, length)), go_term = unlist(BPsplitted)) # List all
BPslimX <- merge(BPslimX, BPslim[,c(1,3:4)], by="Term") # Add back counts, term, and category info
BPslimX <- unique(setDT(BPslimX)[order(go_term, -Count)], by = "go_term") # Remove duplicate offspring terms, keeping only those that appear in the larger umbrella term (larger Count number)
BPslim <- data.frame(slim_term=BPslimX$Term, slim_cat=BPslimX$category, category=BPslimX$go_term) # Rename columns
head(BPslim)
##                          slim_term   slim_cat   category
## 1        cytoskeleton organization GO:0007010 GO:0000226
## 2       vesicle-mediated transport GO:0016192 GO:0000301
## 3                       DNA repair GO:0006281 GO:0000731
## 4 anatomical structure development GO:0048856 GO:0000902
## 5            immune system process GO:0002376 GO:0002698
## 6             reproductive process GO:0022414 GO:0003006
# MF
MFslim <- filter(MFslim, Count>0 & Term!="molecular_function") # Filter out empty slims and term "molecular function"
MFsplitted <- strsplit(as.character(MFslim$go_terms), ";") # Split into multiple GO ids
MFslimX <- data.frame(Term = rep.int(MFslim$Term, sapply(MFsplitted, length)), go_term = unlist(MFsplitted)) # List all
MFslimX <- merge(MFslimX, MFslim[,c(1,3:4)], by="Term")  # Add back counts, term, and category info
MFslimX <- unique(setDT(MFslimX)[order(go_term, -Count)], by = "go_term")  # Remove duplicate offspring terms, keeping only
MFslim <- data.frame(slim_term=MFslimX$Term, slim_cat=MFslimX$category, category=MFslimX$go_term) # Rename columns
head(MFslim)
##            slim_term   slim_cat   category
## 1 catalytic activity GO:0003824 GO:0004019
## 2 catalytic activity GO:0003824 GO:0004364
## 3 catalytic activity GO:0003824 GO:0004745
## 4 catalytic activity GO:0003824 GO:0004844
## 5 catalytic activity GO:0003824 GO:0008172
## 6 catalytic activity GO:0003824 GO:0008336

Add back GO enrichment info for each offspring term.

GO.BP <- right_join(BPslim, filter(GOall.filtered, ontology=="BP"), by="category")
GO.MF <- right_join(MFslim, filter(GOall.filtered, ontology=="MF"), by="category")

5. Make heatmap

Plot heatmap of BP and MF GO slim terms.

term_label_text_size <- 6
slim_label_text_size <- 6

BPplot <- GO.BP %>%
  filter(numInCat>5) %>%
  mutate(term = fct_reorder(term, -over_represented_pvalue)) %>%
  ggplot(aes(x = dir, y = term)) + 
    geom_tile(aes(fill=over_represented_pvalue, width = 1)) + 
    facet_grid(slim_term ~ ontology, scales = "free_y", labeller = label_wrap_gen(width = 10, multi_line = TRUE))+
    theme_bw() +
    theme(panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(colour = "black"),
          strip.text.y = element_text(angle=0, size = slim_label_text_size, face = "bold"),
          strip.text.x = element_text(size = 12, face = "bold"),
          axis.title.x = element_blank(),
          axis.title.y = element_text(size=15),
          axis.text = element_text(size = term_label_text_size), legend.position = "None",
          plot.margin = unit(c(0,1,0,0.25), "cm")
    ); BPplot

MFplot <- GO.MF %>%
  filter(numInCat>5) %>%
  mutate(term = fct_reorder(term, -over_represented_pvalue)) %>%
  ggplot(aes(x = dir, y = term)) + 
    geom_tile(aes(fill=over_represented_pvalue, width = 1)) + 
    scale_y_discrete(position = "right") +
    facet_grid(slim_term ~ ontology,
               scales = "free_y",
               labeller = label_wrap_gen(width = 10, multi_line = TRUE), 
               switch="y" # Put the y facet strips on the left
              ) +
    theme_bw() +
    theme(panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(colour = "black"),
          strip.text.y.left = element_text(angle=0, size = slim_label_text_size, face = "bold"),
          strip.text.x = element_text(size = 12, face = "bold"),
          axis.title = element_blank(),
          axis.text = element_text(size = term_label_text_size),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 11)
    ); MFplot

Combined BP and MF plots.

BPplot + MFplot + plot_annotation(tag_levels = "A", tag_suffix = ")") & theme(plot.tag = element_text(size=15, face="bold"))

6. Kegg enrichment analysis

Select KEGG Orthogroup IDs (KOs) from annotation dataframe.

# Extract gene_ids and KO from annotation file
KO.terms <- annot %>%
  subset(select=c(gene_id, KEGG_IDs)) %>% # Select columns
  drop_na() %>% # Remove rows with missing GO terms or gene IDs
  filter(KEGG_IDs!="-") # Remove rows with missing values indicated by '-'

# Split multiple KEGG IDs per line into multiple lines
splitted <- strsplit(as.character(KO.terms$KEGG_IDs), "; ") # Split into multiple KEGG IDs
KO.terms <- data.frame(v1 = rep.int(KO.terms$gene_id, sapply(splitted, length)), v2 = unlist(splitted)) # List all genes with each of their KEGG IDs in a single row
colnames(KO.terms) <- c("gene_id", "KEGG_IDs")

KO.terms <- unique(KO.terms)
colnames(KO.terms) <- c("gene_id", "GO.ID")
head(KO.terms)
##                                        gene_id  GO.ID
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1a K00549
## 2 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b K16866
## 3  Pocillopora_acuta_KBHIv2___RNAseq.g18333.t1 K03386
## 4   Pocillopora_acuta_KBHIv2___RNAseq.g7985.t1 K12418
## 5      Pocillopora_acuta_KBHIv2___TS.g15308.t1 K13755
## 6   Pocillopora_acuta_KBHIv2___RNAseq.g2057.t1 K11001
# Bind KO and GO references
GOKO.terms <- bind_rows(GO.terms, KO.terms)

Perform Kegg enrichment with goseq package.

KOall <- goseq(pwf, GOref$gene_id, gene2cat=GOKO.terms, test.cats=c("KEGG"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns

Extract significantly enriched KEGG terms.

KOall.filtered <- KOall %>% 
  filter(over_represented_pvalue < KEGG_enrich_pvalue.cutoff | under_represented_pvalue < KEGG_enrich_pvalue.cutoff) %>% # Filter GO enrichment results by p-value
  arrange(ontology, over_represented_pvalue, -numDEInCat) %>% # Reorder
  mutate(term = factor(term, levels = unique(term))) %>% # Make 'terms' column a factor
  mutate(dir = if_else(over_represented_pvalue < GO_enrich_pvalue.cutoff, 
                       "Over", 
                       if_else(under_represented_pvalue < GO_enrich_pvalue.cutoff, 
                               "Under", 
                               "NULL"
                              )
                      )
        ) %>%
  mutate(ontology = replace_na(ontology, "KEGG")) %>%
  filter(ontology=="KEGG")

# Print stats
print(paste("Number KEGG IDs BEFORE sig filtering: ", nrow(KOall), sep=''))
## [1] "Number KEGG IDs BEFORE sig filtering: 28185"
print(paste("Number KEGG IDs AFTER sig filtering:  ", nrow(KOall.filtered), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering:  63"
print(paste("Number KEGG IDs AFTER sig filtering OVER:  ", nrow(KOall.filtered %>% filter(over_represented_pvalue  < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering OVER:  58"
print(paste("Number KEGG IDs AFTER sig filtering UNDER: ", nrow(KOall.filtered %>% filter(under_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering UNDER: 5"

Add KO definitions.

# Load definition file
KEGG_IDs_Descriptions <- read.table(KEGG_IDs_Descriptions.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM", quote='') # Read in file

# Prep definition data
KEGG_IDs_Descriptions <- KEGG_IDs_Descriptions %>%
  subset(select=c("D.ID", "D.Description")) %>%
  unique()
colnames(KEGG_IDs_Descriptions) <- c("category", "description")

# Merge with KEGG output
KOall.filtered.descriptions <- unique(left_join(KOall.filtered, KEGG_IDs_Descriptions, by=c("category")))

Write output KEGG enrichment files.

write.table(KOall.filtered.descriptions, file = KEGG_sig.file, row.names=F, quote=F, sep='\t')